In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [ ]:
data = pd.read_csv(r'C:\Users\gokul\OneDrive\Desktop\DATA DC\DC SEM 2\2204 Stat and Pred Modelling\Week_15\wireless_churn.csv')
data.head()
Out[ ]:
AccountWeeks ContractRenewal DataPlan DataUsage CustServCalls DayMins DayCalls MonthlyCharge OverageFee RoamMins Churn
0 128 1 1 2.7 1 265.1 110 89.0 9.87 10.0 0
1 107 1 1 3.7 1 161.6 123 82.0 9.78 13.7 0
2 137 1 0 0.0 0 243.4 114 52.0 6.06 12.2 0
3 84 0 0 0.0 2 299.4 71 57.0 3.10 6.6 0
4 75 0 0 0.0 3 166.7 113 41.0 7.42 10.1 0
In [ ]:
#Show Key Statistics
data.describe()
Out[ ]:
AccountWeeks ContractRenewal DataPlan DataUsage CustServCalls DayMins DayCalls MonthlyCharge OverageFee RoamMins Churn
count 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000 3333.000000
mean 101.064806 0.903090 0.276628 0.816475 1.562856 179.775098 100.435644 56.305161 10.051488 10.237294 0.144914
std 39.822106 0.295879 0.447398 1.272668 1.315491 54.467389 20.069084 16.426032 2.535712 2.791840 0.352067
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 0.000000 0.000000 0.000000
25% 74.000000 1.000000 0.000000 0.000000 1.000000 143.700000 87.000000 45.000000 8.330000 8.500000 0.000000
50% 101.000000 1.000000 0.000000 0.000000 1.000000 179.400000 101.000000 53.500000 10.070000 10.300000 0.000000
75% 127.000000 1.000000 1.000000 1.780000 2.000000 216.400000 114.000000 66.200000 11.770000 12.100000 0.000000
max 243.000000 1.000000 1.000000 5.400000 9.000000 350.800000 165.000000 111.300000 18.190000 20.000000 1.000000

EDA - Pandas Profiling Report¶

In [ ]:
from ydata_profiling import ProfileReport

profile = ProfileReport(data, title = 'Wireless Churn Report')
profile.to_notebook_iframe()
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Split the data to test & train for Model Comparison¶

In [ ]:
#Prepare for Models Comparison

#Create x and y variables
x = data.drop('Churn', axis=1).to_numpy()
Y = data['Churn'].to_numpy()

#Load Library for Training
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,Y,test_size = 0.2,stratify=Y,random_state = 100)

Removing Anomalies using Isolation Forest¶

In [ ]:
# Use built-in isolation forest
from sklearn.ensemble import IsolationForest

# The prediction returns 1 if sample point is inlier. If outlier prediction returns -1
clf_all_features = IsolationForest(random_state=100)
clf_all_features.fit(x_train)

#Predict if a particular sample is an outlier using all features for higher dimensional data set.
y_pred_train = clf_all_features.predict(x_train)
y_pred_train2 =np.array(list(map(lambda x: x == 1, y_pred_train)))

# Exclude suggested outlier samples for improvement of prediction power/score
x_train_mod = x_train[y_pred_train2, ]
y_train_mod = y_train[y_pred_train2, ]

#Size of Datasets
print('Original Train Dataset Size : {}'.format(len(x_train)))
print('New Train Dataset Size      : {}'.format(len(x_train_mod)))
Original Train Dataset Size : 2666
New Train Dataset Size      : 2124

Creating Logistic Regression and Naïve Bayes Models¶

In [ ]:
#Scale the Data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train2 = sc.fit_transform(x_train_mod)
x_test2 = sc.fit_transform(x_test)

x_2 = sc.fit_transform(x)

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
In [ ]:
#Construct some pipelines 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

#Create Pipeline

pipeline =[]

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(solver= 'lbfgs', class_weight= 'balanced', max_iter= 1000, random_state=100))])
pipeline.insert(0,pipe_lr)

pipe_nb = Pipeline([('scl', StandardScaler()),
                    ('clf', GaussianNB())])
pipeline.insert(1,pipe_nb)

# Set grid search params 

modelpara =[]

param_gridlogistic = {'clf__C': [0.01, 0.1, 1, 10, 100],
                      'clf__penalty': ['l2'],
                      'clf__solver':['newton-cg', 'sag', 'saga' , 'lbfgs']}
modelpara.insert(0, param_gridlogistic)

param_gridnb = {}           #Naive Bayes does not have any hyperparameters to tune
modelpara.insert(1, param_gridnb)

Plotting Learning Curve¶

In [ ]:
#Define Plot for learning curve

from sklearn.model_selection import learning_curve

def plot_learning_curves(model):
    train_sizes, train_scores, test_scores = learning_curve(estimator=model,
                                                            X=x_train_mod, 
                                                            y=y_train_mod,
                                                            train_sizes= np.linspace(0.1, 1.0, 10),
                                                            cv=10,
                                                            scoring='recall_weighted',random_state=100)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    
    plt.plot(train_sizes, train_mean,color='blue', marker='o', 
             markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                     alpha=0.15, color='blue')

    plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5,
             label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                     alpha=0.15, color='green')
    plt.grid()
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.5, 1.01])
    plt.show()
In [ ]:
#Plot Learning Curve
print('Logistic Regression - Learning Curve')
plot_learning_curves(pipe_lr)
print('\nNaive Bayes - Learning Curve')
plot_learning_curves(pipe_nb)
Logistic Regression - Learning Curve
Naive Bayes - Learning Curve

Creating Optimized Models and ROC/AUC curves¶

In [ ]:
#Define Gridsearch Function
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix  

def Gridsearch_cv(model, params):
    
    #Cross-validation Function
    cv2=RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
        
    #GridSearch CV
    gs_clf = GridSearchCV(model, params, cv=cv2,scoring='recall_weighted')
    gs_clf = gs_clf.fit(x_train_mod, y_train_mod)
    model = gs_clf.best_estimator_
    
    # Use best model and test data for final evaluation
    y_pred = model.predict(x_test2)
    #Identify Best Parameters to Optimize the Model
    bestpara=str(gs_clf.best_params_)
    
    #Output Heading
    print('\n__________________________________________________________________________________________________')
    print('\nOptimized Model')
    print('\nModel Name:',str(pipeline.named_steps['clf']))
    
    #Output Validation Statistics
    print('\nBest Parameters:',bestpara)
    print('\n', confusion_matrix(y_test,y_pred))  
    print('\n',classification_report(y_test,y_pred))   
    
    #Transform the variables into binary (0,1) - ROC Curve
    from sklearn import preprocessing
    Forecast1=pd.DataFrame(y_pred)
    Outcome1=pd.DataFrame(y_test)
    lb1 = preprocessing.LabelBinarizer()
    OutcomeB1 =lb1.fit_transform(Outcome1)
    ForecastB1 = lb1.fit_transform(Forecast1)
    
    #Setup the ROC Curve
    from sklearn.metrics import roc_curve, auc
    from sklearn import metrics
    fpr, tpr, threshold = metrics.roc_curve(OutcomeB1, ForecastB1)
    roc_auc = metrics.auc(fpr, tpr)
    print('ROC Curve')
    #Plot the ROC Curve
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()  
In [ ]:
#Run Models

for pipeline, modelpara in zip(pipeline,modelpara):
    Gridsearch_cv(pipeline,modelpara)
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
c:\Users\gokul\anaconda3\Lib\site-packages\sklearn\linear_model\_sag.py:349: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
__________________________________________________________________________________________________

Optimized Model

Model Name: LogisticRegression(class_weight='balanced', max_iter=1000, random_state=100)

Best Parameters: {'clf__C': 0.01, 'clf__penalty': 'l2', 'clf__solver': 'sag'}

 [[535  35]
 [ 75  22]]

               precision    recall  f1-score   support

           0       0.88      0.94      0.91       570
           1       0.39      0.23      0.29        97

    accuracy                           0.84       667
   macro avg       0.63      0.58      0.60       667
weighted avg       0.81      0.84      0.82       667

ROC Curve
__________________________________________________________________________________________________

Optimized Model

Model Name: GaussianNB()

Best Parameters: {}

 [[185 385]
 [  9  88]]

               precision    recall  f1-score   support

           0       0.95      0.32      0.48       570
           1       0.19      0.91      0.31        97

    accuracy                           0.41       667
   macro avg       0.57      0.62      0.40       667
weighted avg       0.84      0.41      0.46       667

ROC Curve

Creating Ensemble Voting Model¶

In [ ]:
#Create Voting Model - Sklearn
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier

estimators = []

mod1 = LogisticRegression(solver= 'lbfgs', class_weight= 'balanced', max_iter= 1000, random_state=100)
estimators.append(('LogisticRegression', model1))

mod2 = GradientBoostingClassifier(random_state=100)
estimators.append(('GradientBoostingClassifier', model2))

mod3 = GaussianNB()
estimators.append(('GaussianNB', model3))


voting_clf=VotingClassifier(estimators,voting='soft')

scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro'}
print('\nVoting Model')
for clf in (mod1,mod2,mod3,voting_clf):
    rkfcv= clf.fit(x_train2,y_train_mod)
    ens_rkf1 = RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
    rKFcv = cross_validate(rkfcv, x_2, Y, scoring=scoring, cv=ens_rkf1)
    print(clf.__class__.__name__,round(rKFcv['test_rec_macro'].mean(),2))   
Voting Model
LogisticRegression 0.76
GradientBoostingClassifier 0.82
GaussianNB 0.67
VotingClassifier 0.81
In [ ]: